date()
## [1] "Thu Nov 19 18:02:50 2020"
The data set this week is about: Housing values in suburbs of Boston (source:https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html) there are several variables that affect the housing values such as: criminality, infrastructure, density, size of the house…
crim: per capita crime rate by town.
zn: proportion of residential land zoned for lots over 25,000 sq.ft.
indus: proportion of non-retail business acres per town.
chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox: nitrogen oxides concentration (parts per 10 million).
rm: average number of rooms per dwelling.
age: proportion of owner-occupied units built prior to 1940.
dis: weighted mean of distances to five Boston employment centres.
rad: index of accessibility to radial highways.
tax: full-value property-tax rate per $10,000.
ptratio: pupil-teacher ratio by town.
black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
lstat: lower status of the population (percent).
medv: median value of owner-occupied homes in $1000s.
library(MASS)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library (plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data("Boston")
dim(Boston)
## [1] 506 14
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
cor<-cor(Boston, method = "pearson", use = "complete.obs")
round(cor, 2)
## crim zn indus chas nox rm age dis rad tax ptratio
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
## age 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
## dis -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
## rad 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
## tax 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
## ptratio 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## black -0.39 0.18 -0.36 0.05 -0.38 0.13 -0.27 0.29 -0.44 -0.44 -0.18
## lstat 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## medv -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## black lstat medv
## crim -0.39 0.46 -0.39
## zn 0.18 -0.41 0.36
## indus -0.36 0.60 -0.48
## chas 0.05 -0.05 0.18
## nox -0.38 0.59 -0.43
## rm 0.13 -0.61 0.70
## age -0.27 0.60 -0.38
## dis 0.29 -0.50 0.25
## rad -0.44 0.49 -0.38
## tax -0.44 0.54 -0.47
## ptratio -0.18 0.37 -0.51
## black 1.00 -0.37 0.33
## lstat -0.37 1.00 -0.74
## medv 0.33 -0.74 1.00
corrplot.mixed(cor, lower = "ellipse", upper="number", tl.col = "black", tl.srt = 45)
Bigger houses are more expensive (number of rooms r=0.7), the house value is lower for lower status people (r=0.74). There are intercorrelation between the variables that can explain the houses values.
Here we subtract the column means from the corresponding columns and divide the difference with standard deviation, all the variables have a mean=0.
boston_scaled <- scale(Boston)
summary(boston_scaled)
## crim zn indus chas
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563 Min. :-0.2723
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668 1st Qu.:-0.2723
## Median :-0.390280 Median :-0.48724 Median :-0.2109 Median :-0.2723
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150 3rd Qu.:-0.2723
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202 Max. : 3.6648
## nox rm age dis
## Min. :-1.4644 Min. :-3.8764 Min. :-2.3331 Min. :-1.2658
## 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366 1st Qu.:-0.8049
## Median :-0.1441 Median :-0.1084 Median : 0.3171 Median :-0.2790
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059 3rd Qu.: 0.6617
## Max. : 2.7296 Max. : 3.5515 Max. : 1.1164 Max. : 3.9566
## rad tax ptratio black
## Min. :-0.9819 Min. :-1.3127 Min. :-2.7047 Min. :-3.9033
## 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876 1st Qu.: 0.2049
## Median :-0.5225 Median :-0.4642 Median : 0.2746 Median : 0.3808
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058 3rd Qu.: 0.4332
## Max. : 1.6596 Max. : 1.7964 Max. : 1.6372 Max. : 0.4406
## lstat medv
## Min. :-1.5296 Min. :-1.9063
## 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 3.5453 Max. : 2.9865
class(boston_scaled)
## [1] "matrix"
boston_scaled <- as.data.frame(boston_scaled)
creating a quantile vector of crime
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
Dividing the dataset to train and test sets, so that 80% of the data belongs to the train set.
ncol <- nrow(boston_scaled)
ind <- sample(ncol, size = ncol * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
Fitting the linear discriminant analysis (LDA) and LDA (bi)plot.
LDA is a classification (and dimension reduction) method. It finds the (linear) combination of the variables that separate the target variable classes. The target can be binary or multiclass variable.
lda.fit <- lda(crime ~ ., data = train)
lda.arrows <- function(x, myscale = 3, arrow_heads = 0.1, color = "orange", tex = 1.25, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)}# the function for lda biplot arrows (datacamp)
classes <- as.numeric(train$crime)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 3)
The plot shows that there are more crimes in areas with close access to the highway and in the residential zones, as showed in the correlation matrix.
correct_classes<-test$crime
test <- dplyr::select(test, -crime)
lda.pred <- predict(lda.fit, newdata = test) #Test=the rest=20%
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 16 10 2 0
## med_low 3 19 2 0
## med_high 1 7 19 0
## high 0 0 1 22
Similar to the LDA, more crimes are expected in high and low class neighbourhoods.
The optimal number of clusters is -> 2
data("Boston")
boston_scaled <- as.data.frame(scale(Boston))
dis<-dist(boston_scaled)
summary(dis)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
set.seed(123)
k_max <- 20
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})# calculate the total within sum of squares
qplot(x = 1:k_max, y = twcss, geom = 'line')
Visualization of the clusters per variable:
km <-kmeans(boston_scaled, centers = 2)
ggpairs(boston_scaled, mapping = aes(colour=as.factor(km$cluster)), legend = 1,
upper = list(continuous =wrap("cor", size=3)),
title="clusters overview",
lower = list(combo = wrap("facethist",size=0.1, bins = 20, alpha=0.3)))+
theme(legend.position="bottom")
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
km2 <-kmeans(boston_scaled, centers= 3)
boston_scaled$km_cluster<-km2$cluster
lda.fit2 <- lda(km_cluster ~ ., data = boston_scaled)# linear discriminant analysis
lda.fit2
## Call:
## lda(km_cluster ~ ., data = boston_scaled)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3241107 0.4664032 0.2094862
##
## Group means:
## crim zn indus chas nox rm
## 1 0.8046456 -0.4872402 1.117990 0.01575144 1.1253988 -0.46443119
## 2 -0.3760908 -0.3417123 -0.296848 0.01127561 -0.3345884 -0.09228038
## 3 -0.4075892 1.5146367 -1.068814 -0.04947434 -0.9962503 0.92400834
## age dis rad tax ptratio black
## 1 0.79737580 -0.85425848 1.2219249 1.2954050 0.60580719 -0.6407268
## 2 -0.02966623 0.05695857 -0.5803944 -0.6030198 -0.08691245 0.2863040
## 3 -1.16762641 1.19486951 -0.5983266 -0.6616391 -0.74378342 0.3538816
## lstat medv
## 1 0.8719904 -0.68418954
## 2 -0.1801190 0.03577844
## 3 -0.9480974 0.97889973
##
## Coefficients of linear discriminants:
## LD1 LD2
## crim 0.03134296 0.14880455
## zn 0.06381527 1.22350515
## indus -0.61086696 0.10402980
## chas -0.01953161 -0.03579238
## nox -1.00230143 0.70464917
## rm 0.16285767 0.44390394
## age 0.07220634 -0.59785382
## dis 0.04270475 0.45498614
## rad -0.71987743 0.02882054
## tax -0.98285440 0.70663319
## ptratio -0.22527977 0.15514668
## black 0.01693595 -0.03181845
## lstat -0.18274033 0.50122677
## medv 0.02892966 0.64244841
##
## Proportion of trace:
## LD1 LD2
## 0.8409 0.1591
lda biplot
classes <- as.numeric(boston_scaled$km_cluster)# target classes as numeric
plot(lda.fit2, dimen = 2, col = classes, pch = classes)# plot the lda results
lda.arrows(lda.fit, myscale = 3)
There differences between the clusters, cluster 3 is better explain by rad and tax variables, while cluster one by age.
creating K-means for the training data color by crime insidence
model_predictors <- dplyr::select(train, -crime)
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling # matrix multiplication
matrix_product <- as.data.frame(matrix_product)# matrix multiplication
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3,
color=train$crime, type= 'scatter3d', mode='markers')
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Creation of 2 clusters:
data("Boston")
boston_scaled <- as.data.frame(scale(Boston))
train <- boston_scaled[ind,]
str(train)
## 'data.frame': 404 obs. of 14 variables:
## $ crim : num -0.41354 -0.27973 0.00699 -0.41793 0.37059 ...
## $ zn : num 1.228 -0.487 -0.487 3.157 -0.487 ...
## $ indus : num -0.689 1.231 1.015 -1.018 1.015 ...
## $ chas : num 3.665 -0.272 -0.272 -0.272 -0.272 ...
## $ nox : num -0.929 0.434 0.244 -1.085 1.366 ...
## $ rm : num 0.6737 -0.583 0.0389 0.3293 -0.2898 ...
## $ age : num -1.267 0.925 -0.592 -1.452 0.562 ...
## $ dis : num 0.1342 -0.6502 0.0934 2.2511 -0.5117 ...
## $ rad : num -0.637 -0.522 1.66 -0.637 1.66 ...
## $ tax : num -0.9152 -0.0311 1.5294 -0.3396 1.5294 ...
## $ ptratio: num -0.395 -1.735 0.806 -0.257 0.806 ...
## $ black : num 0.441 -0.705 0.35 0.392 0.441 ...
## $ lstat : num -1.278 0.249 -0.29 -0.881 0.287 ...
## $ medv : num 1.0729 -0.5581 -0.1449 0.0617 -0.2754 ...
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
km3 <-kmeans(train, centers= 2)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3,
color=as.factor(km3$cluster), type= 'scatter3d', mode='markers')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Creation of 3 clusters:
data("Boston")
boston_scaled <- as.data.frame(scale(Boston))
train <- boston_scaled[ind,]
km4 <-kmeans(train, centers= 3)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3,
color=as.factor(km4$cluster), type= 'scatter3d', mode='markers')